import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn import tree
from sklearn.pipeline import Pipeline, make_pipeline
from statsmodels.formula.api import ols
import statsmodels.stats.api as sms
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
from xgboost import XGBClassifier
from icecream import ic
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
# Load the churn dataset and report its dimensions.
df = pd.read_csv("../../data/BankChurners.csv")
n_rows, n_cols = df.shape
print(f'There are {n_rows} rows and {n_cols} columns.')
# Preview 10 random rows. Fixing NumPy's global seed makes the sample
# reproducible across notebook runs.
np.random.seed(2)
df.sample(n=10)
There are 10127 rows and 21 columns.
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 7862 | 711589758 | Existing Customer | 42 | M | 5 | High School | Married | $60K - $80K | Blue | 32 | 1 | 2 | 2 | 4162.0 | 1641 | 2521.0 | 0.719 | 3967 | 75 | 0.923 | 0.394 |
| 3536 | 717886008 | Existing Customer | 63 | M | 1 | High School | Single | Less than $40K | Blue | 51 | 5 | 3 | 2 | 6102.0 | 0 | 6102.0 | 0.639 | 4527 | 67 | 0.558 | 0.000 |
| 9770 | 720852108 | Existing Customer | 46 | M | 4 | High School | Divorced | $120K + | Blue | 31 | 2 | 3 | 1 | 3569.0 | 1553 | 2016.0 | 0.798 | 16695 | 116 | 0.681 | 0.435 |
| 8909 | 719785683 | Existing Customer | 41 | M | 4 | Post-Graduate | Married | $80K - $120K | Blue | 36 | 1 | 2 | 2 | 21751.0 | 1573 | 20178.0 | 0.878 | 8332 | 102 | 0.569 | 0.072 |
| 709 | 780054258 | Existing Customer | 40 | M | 5 | Graduate | Unknown | $40K - $60K | Blue | 25 | 6 | 2 | 3 | 7860.0 | 541 | 7319.0 | 0.764 | 1367 | 35 | 0.750 | 0.069 |
| 975 | 717180633 | Existing Customer | 59 | M | 1 | Post-Graduate | Married | $40K - $60K | Blue | 36 | 4 | 2 | 2 | 2483.0 | 1372 | 1111.0 | 1.642 | 1704 | 35 | 0.458 | 0.553 |
| 32 | 709029408 | Existing Customer | 41 | M | 4 | Graduate | Married | $60K - $80K | Blue | 36 | 4 | 1 | 2 | 8923.0 | 2517 | 6406.0 | 1.726 | 1589 | 24 | 1.667 | 0.282 |
| 9454 | 708510858 | Existing Customer | 60 | F | 1 | Uneducated | Single | Less than $40K | Blue | 47 | 1 | 3 | 2 | 4905.0 | 2413 | 2492.0 | 0.853 | 15478 | 109 | 0.730 | 0.492 |
| 4548 | 781297983 | Existing Customer | 58 | M | 2 | Unknown | Divorced | $60K - $80K | Blue | 52 | 6 | 2 | 3 | 20410.0 | 1196 | 19214.0 | 0.726 | 3525 | 78 | 0.733 | 0.059 |
| 9351 | 789983133 | Existing Customer | 31 | M | 2 | Graduate | Unknown | $80K - $120K | Silver | 22 | 2 | 3 | 2 | 34516.0 | 1780 | 32736.0 | 0.839 | 14185 | 98 | 0.690 | 0.052 |
# Encode the target as a binary flag: attrited -> 1, existing -> 0.
df['Attrition_Flag'] = df['Attrition_Flag'].replace({'Attrited Customer': 1, 'Existing Customer': 0})
What do the data types look like?
# Column dtypes, non-null counts, and memory footprint of the DataFrame.
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null int64 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 10127 non-null object 6 Marital_Status 10127 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(11), object(5) memory usage: 1.6+ MB
# Percentage of missing values per column, sorted highest first.
# (mean of the boolean isna() mask == sum / count of the original form.)
missing_pct = round(df.isna().mean() * 100, 2)
pd.DataFrame(
    {"% of Missing Values": missing_pct}
).sort_values(by="% of Missing Values", ascending=False)
| % of Missing Values | |
|---|---|
| CLIENTNUM | 0.0 |
| Months_Inactive_12_mon | 0.0 |
| Total_Ct_Chng_Q4_Q1 | 0.0 |
| Total_Trans_Ct | 0.0 |
| Total_Trans_Amt | 0.0 |
| Total_Amt_Chng_Q4_Q1 | 0.0 |
| Avg_Open_To_Buy | 0.0 |
| Total_Revolving_Bal | 0.0 |
| Credit_Limit | 0.0 |
| Contacts_Count_12_mon | 0.0 |
| Total_Relationship_Count | 0.0 |
| Attrition_Flag | 0.0 |
| Months_on_book | 0.0 |
| Card_Category | 0.0 |
| Income_Category | 0.0 |
| Marital_Status | 0.0 |
| Education_Level | 0.0 |
| Dependent_count | 0.0 |
| Gender | 0.0 |
| Customer_Age | 0.0 |
| Avg_Utilization_Ratio | 0.0 |
# Cardinality of each column, highest first — helps separate identifiers
# and continuous features from low-cardinality categoricals.
unique_counts = df.nunique()
unique_counts.sort_values(ascending=False)
CLIENTNUM 10127 Avg_Open_To_Buy 6813 Credit_Limit 6205 Total_Trans_Amt 5033 Total_Revolving_Bal 1974 Total_Amt_Chng_Q4_Q1 1158 Avg_Utilization_Ratio 964 Total_Ct_Chng_Q4_Q1 830 Total_Trans_Ct 126 Customer_Age 45 Months_on_book 44 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Education_Level 7 Income_Category 6 Dependent_count 6 Total_Relationship_Count 6 Card_Category 4 Marital_Status 4 Attrition_Flag 2 Gender 2 dtype: int64
# Customer counts per card category, split by churn status (0/1).
by_card_and_churn = df.groupby(['Card_Category', 'Attrition_Flag'])
by_card_and_churn['Attrition_Flag'].count()
Card_Category Attrition_Flag
Blue 0 7917
1 1519
Gold 0 95
1 21
Platinum 0 15
1 5
Silver 0 473
1 82
Name: Attrition_Flag, dtype: int64
Ranking of card categories by number of customers owned and by number of attrited customers:
Blue -> Silver -> Gold -> Platinum
The Blue card clearly leads in volume, and it also accounts for the highest absolute number of churned customers.
# Attrition rate in percent per card category (mean of the 0/1 flag).
df.groupby(['Card_Category'])['Attrition_Flag'].mean().mul(100)
Card_Category Blue 16.097923 Gold 18.103448 Platinum 25.000000 Silver 14.774775 Name: Attrition_Flag, dtype: float64
The attrition rates per card category tell a slightly different story in terms of ranking:
Platinum -> Gold -> Blue -> Silver
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()
| CLIENTNUM | Attrition_Flag | Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1.012700e+04 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 | 10127.000000 |
| mean | 7.391776e+08 | 0.160660 | 46.325960 | 2.346203 | 35.928409 | 3.812580 | 2.341167 | 2.455317 | 8631.953698 | 1162.814061 | 7469.139637 | 0.759941 | 4404.086304 | 64.858695 | 0.712222 | 0.274894 |
| std | 3.690378e+07 | 0.367235 | 8.016814 | 1.298908 | 7.986416 | 1.554408 | 1.010622 | 1.106225 | 9088.776650 | 814.987335 | 9090.685324 | 0.219207 | 3397.129254 | 23.472570 | 0.238086 | 0.275691 |
| min | 7.080821e+08 | 0.000000 | 26.000000 | 0.000000 | 13.000000 | 1.000000 | 0.000000 | 0.000000 | 1438.300000 | 0.000000 | 3.000000 | 0.000000 | 510.000000 | 10.000000 | 0.000000 | 0.000000 |
| 25% | 7.130368e+08 | 0.000000 | 41.000000 | 1.000000 | 31.000000 | 3.000000 | 2.000000 | 2.000000 | 2555.000000 | 359.000000 | 1324.500000 | 0.631000 | 2155.500000 | 45.000000 | 0.582000 | 0.023000 |
| 50% | 7.179264e+08 | 0.000000 | 46.000000 | 2.000000 | 36.000000 | 4.000000 | 2.000000 | 2.000000 | 4549.000000 | 1276.000000 | 3474.000000 | 0.736000 | 3899.000000 | 67.000000 | 0.702000 | 0.176000 |
| 75% | 7.731435e+08 | 0.000000 | 52.000000 | 3.000000 | 40.000000 | 5.000000 | 3.000000 | 3.000000 | 11067.500000 | 1784.000000 | 9859.000000 | 0.859000 | 4741.000000 | 81.000000 | 0.818000 | 0.503000 |
| max | 8.283431e+08 | 1.000000 | 73.000000 | 5.000000 | 56.000000 | 6.000000 | 6.000000 | 6.000000 | 34516.000000 | 2517.000000 | 34516.000000 | 3.397000 | 18484.000000 | 139.000000 | 3.714000 | 0.999000 |
The average utilization ratio is a little on the high side — a healthy value would be under 20%.
# Generate an automated EDA report for the whole DataFrame; the commented-out
# line shows how to persist it to an HTML file instead of rendering inline.
# NOTE(review): pandas_profiling has been renamed to ydata-profiling and the
# old package name is deprecated — consider migrating the import; verify
# against the project's installed dependencies.
#wholeReport = pandas_profiling.ProfileReport(df).to_file("wholeReport.html")
pandas_profiling.ProfileReport(df)